#!/usr/bin/env python
#coding=utf-8

import wordStruct
import FIGModule
import TAGModule
import ConfigModule
import PUBLICMETHOD

import re
import os
import sys

# Process-wide default encoding setup (Python 2 only).
# reload(sys) is needed because the interpreter's site initialisation removes
# sys.setdefaultencoding after start-up; reloading the module brings it back.
# The call below then makes implicit str <-> unicode conversions use UTF-8.
# NOTE(review): this is an import-time side effect that affects every module
# in the process — confirm that callers rely on it before removing.
reload(sys)
sys.setdefaultencoding('utf-8')

class Extract:

	def __init__(self, figIndexList, config):
		"""Create an extractor bound to a figure index and a configuration.

		figIndexList -- object later queried with getCSVPathByFigFileName()
		config       -- object later queried for the delete/weight word dictionaries
		"""
		# text fields, filled in later by getFigContent()
		self.__caption = ""
		self.__title = ""
		self.__fileName = ""
		self.__no = ""

		# keywords collected by filterWord()
		self.__filteredKeywordList = []

		# working containers supplied by the project modules
		self.__figList = FIGModule.FigList()
		self.__captionQueue = wordStruct.WordQueue()
		self.__fig = FIGModule.Fig()

		# collaborators handed in by the caller
		self.__figIndexList = figIndexList
		self.__config = config

	#look up the fig whose file name is figName (via the fig index and its CSV) and keep it on the instance
	def getFigByfileName(self, figName):
		"""
		try:
			CSVPath = self.__figIndexList.getCSVPathByFigNo(figName)
			print CSVPath
			self.__figList.readCSV(CSVPath)
			self.__fig = self.__figList.getFigByFileName(figName)
		except Exception, ex:
			print ex
			print "The figName is illegal"
		"""
		CSVPath = self.__figIndexList.getCSVPathByFigFileName(figName)
		if CSVPath == 0:
				raise NameError('This fileName is illegal!')
		self.__figList.readCSV(CSVPath)
		self.__fig = self.__figList.getFigByFileName(figName)

	#get content from fig to attribute
	def getFigContent(self):
		self.__title = self.__fig.getTitle()
		self.__caption = self.__fig.getCaption()
		self.__fileName = self.__fig.getFilename()
		self.__no = self.__fig.getNo()

	"""
	def getLowerCaseContent(self):
		self.__title = self.__title.lower()
		self.__caption = self.__caption.lower()
		self.__fileName = self.__fileName.lower()
	"""


	def addtionalWordDel(self):
		#use regular expression

		#delete the number
		#self.__caption = re.sub(r'\d+',' ',self.__caption) 
		#delete some punctuation with . behind
		#self.__caption = re.sub(r'\W\.','',self.__caption)
		#delete some punctuation with , behind
		#self.__caption = re.sub(r'\W\,','',self.__caption)


		#delete the punctuation
		#delete some single alphas
		self.__caption = re.sub(r'\W[a-zA-Z]\W',' ',self.__caption)
		#delete all the word only contain number or "." or "-"
		self.__caption = re.sub(r'\W\-*[\d\.]+\W',' ',self.__caption)
		#delete all the "(" or ")" or "[" or "]" or "{" or "}" and the "," or "." behind them
		self.__caption = re.sub(r'[\(\)\[\]\{\}][\,\.]?','',self.__caption)
		#delete some consecutive punctuation with one blank behind
		self.__caption = re.sub(r'\W+\s',' ',self.__caption)
		#delete some punctuation with one blank before and some alpha behind
		self.__caption = re.sub(r'(?<=\s)\W(?=\w)','',self.__caption)
		#delete the punctuation in the end of a sentence
		self.__caption = self.__caption[:-1]
		

		#print self.__fig.getCaption()

	def splitWord(self):
		captionContent = self.__caption.split()
		for data in captionContent:
			self.__captionQueue.setWordQueueWithFirstAlph(data, data[0:1])
		del captionContent
		#print self.__captionQueue.getWordQueueContent()

	def deleteWord(self):
		delWordDic = self.__config.getDelWordDic()
		delWordDicIndex = self.__config.getDelWordDicIndexStr()
		numOfWord = self.__captionQueue.getCountQueue()

		#use i to iterate the caption word
		i = 0
		
		while  i < numOfWord:
			#this progress is word level

			#use j to iterate the delWordDic
			j = 0
			firstAlph = self.__captionQueue.getWordQueue()[i].getWordFirstAlph().lower()

			#if find out the first alph of the caption word is the same as the delWordDicIndex
			if delWordDicIndex.find(firstAlph) != -1:

				#use j to iterate the delWordDic
				while j < delWordDic[ord(firstAlph)].getCountQueue():

					if self.__captionQueue.getWordQueue()[i].getWordContent() == delWordDic[ord(firstAlph)].getWordQueue()[j].getWordContent()\
					or self.__captionQueue.getWordQueue()[i].getWordContent() == delWordDic[ord(firstAlph)].getWordQueue()[j].getWordContent().title():
						"""
						print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
						print self.__captionQueue.getWordQueue()[i].getWordContent()
						print delWordDic[ord(firstAlph)].getWordQueue()[j].getWordContent()
						print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
						"""
						self.__captionQueue.deleteWord(i)
						numOfWord -= 1
						i -= 1
						break

					j += 1
			i += 1

		#print self.__captionQueue.getWordQueueContent()

	def setWordWei(self):
		weiWordDic = self.__config.getWeiWordDic()
		weiWordDicIndex = self.__config.getWeiWordDicIndexStr()
		numOfWord = self.__captionQueue.getCountQueue()

		#use i to iterate the caption word
		i = 0
		
		while  i < numOfWord:
			#this progress is word level

			#use j to iterate the weiWordDic
			j = 0

			firstAlph = self.__captionQueue.getWordQueue()[i].getWordFirstAlph().lower()
			
			#print firstAlph

			#if find out the first alph of the caption word is the same as the weiWordDicIndex
			if weiWordDicIndex.find(firstAlph) != -1:

				#use j to iterate the weiWordDic
				while j < weiWordDic[ord(firstAlph)].getCountQueue():

					if self.__captionQueue.getWordQueue()[i].getWordContent() == weiWordDic[ord(firstAlph)].getWordQueue()[j].getWordContent()\
					or self.__captionQueue.getWordQueue()[i].getWordContent() == weiWordDic[ord(firstAlph)].getWordQueue()[j].getWordContent().title():
						"""
						print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
						print self.__captionQueue.getWordQueue()[i].getWordContent()
						print self.__config.getWeiWordDic()[ord(firstAlph)].getWordQueue()[j].getWordContent()
						print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
						"""

						#set word's weight
						if i != 0:
							self.__captionQueue.changeWordWeight(i - 1, weiWordDic[ord(firstAlph)].getWordQueue()[j].getWeightBefore())
						#print "after" + self.__config.getWeiWordDic()[ord(firstAlph)].getWordQueue()[j].getWeightAfter()
						self.__captionQueue.changeWordWeight(i + 1, weiWordDic[ord(firstAlph)].getWordQueue()[j].getWeightAfter())
						
						#delete the word that used to set word's weight
						self.__captionQueue.deleteWord(i)
						
						numOfWord -= 1
						i -= 1
						break

					j += 1
			i += 1

		#print self.__captionQueue.getWordQueue()[0].getWordContent()
		#print self.__captionQueue.getWordQueue()[0].getWordWeight()

	#apply some additional pattern-based rules to make the keyword weights more accurate
	def addtionalWordWei(self):		
		wordQueue = self.__captionQueue
		numOfWord = wordQueue.getCountQueue()		

		#These below are the rules for special word
		###################################################################################################
		#add up 3 points for the word that all of its alpha are upper case
		patternWithAllUpperCaseAlph = re.compile(r'^[A-Z,\W]+$')
		#add up 3 points for the word with punctuation
		patternWithPunctuation = re.compile(r'\W')
		#add up -1 point for the word only with number
		patternWithAllNumber = re.compile(r'\d+$')
		#add up 3 points for the word with number
		pattrenWithNumber = re.compile(r'\d')
		###################################################################################################

		def setAddtionalWei(i, pattern, point):
			if pattern.search(wordQueue.getWordQueue()[i].getWordContent()):
				wordQueue.changeWordWeight(i, point)

		i = 0

		while i < numOfWord:
			setAddtionalWei(i, patternWithAllUpperCaseAlph, 3)
			setAddtionalWei(i, patternWithPunctuation, 3)
			setAddtionalWei(i, patternWithAllNumber, -1)
			setAddtionalWei(i, pattrenWithNumber, 3)

			i += 1

	#Quick sort method shows below:
	########################################################################################################
	def partition(self, low, high):
		#use the first word's Wei as a pivot
		pivot = self.__captionQueue.getWordQueue()[low].getWordWeight()
		temp = self.__captionQueue.getWordQueue()[low]

		while low < high:
			#front to back, find out the Wei of word that less than pivot 
			while low < high and self.__captionQueue.getWordQueue()[high].getWordWeight() <= pivot:
				high -= 1
			
			if low < high:
				#change two words
				self.__captionQueue.getWordQueue()[low] = self.__captionQueue.getWordQueue()[high]
				low += 1

			#back to front, find out the Wei of word that more than pivot
			while low < high and self.__captionQueue.getWordQueue()[low].getWordWeight() >= pivot:
				low += 1

			if low < high:
				#change two words
				self.__captionQueue.getWordQueue()[high] = self.__captionQueue.getWordQueue()[low]
				high -= 1
		
		#move the less one to the temp's position
		self.__captionQueue.getWordQueue()[low] = temp
	
		#return the position
		return low

	def QuickSort(self, low, high):
		if (low < high):
			n = self.partition(low, high)
			self.QuickSort(low, n)
			self.QuickSort(n+1, high)

	def rankWordByScore(self):
		high = self.__captionQueue.getCountQueue() - 1
		low = 0
		self.QuickSort(low, high)
	
	#a word whose weight is at least 2 counts as a keyword; when the caption has five words or fewer, every word is kept as a keyword
	def filterWord(self):
		if self.__captionQueue.getCountQueue() <= 5:
			for data in self.__captionQueue.getWordQueue():
				self.__filteredKeywordList.append(data.getWordContent())
		else:
			for data in self.__captionQueue.getWordQueue():
				#data.getWordContent()
				#print data.getWordWeight()
				if data.getWordWeight() >= 2.0:
					self.__filteredKeywordList.append(data.getWordContent())
					#print "---------"
					#print self.__filteredKeywordList


	def getFilteredKeywordList(self):
		#print self.__filteredKeywordList
		return self.__filteredKeywordList
	########################################################################################################

	def showWordContent(self):
		"""Print every caption word followed by its weight, each on its own line."""
		for word in self.__captionQueue.getWordQueue():
			print(word.getWordContent())
			print(word.getWordWeight())

	def showFilterWord(self):
		"""Print every filtered keyword on its own line."""
		for keyword in self.__filteredKeywordList:
			print(keyword)